“Files Used:”

Z:/COVID-19_WastewaterAnalysis/data/processed/MMSD_Interceptor_Cases_2_7_22.csv

Z:/COVID-19_WastewaterAnalysis/data/processed/LIMSWasteData_02-09-22.csv

## [1] "Madison"  "MMSD-P2"  "MMSD-P7"  "MMSD-P8"  "MMSD-P11" "MMSD-P18"

1 Data: The first look

The two data sets used in this analysis are the Madison case data sourced from the Wisconsin DHS and wastewater concentration data produced by the Wisconsin State Laboratory of Hygiene. This wastewater data has entries every couple of days from 15 September 2020 to 25 January 2022.

Date Site FirstConfirmed SevenDayMACases N1 BCoV N2 PMMoV GeoMeanN12 FlowRate temperature equiv_sewage_amt
2021-01-11 MMSD-P11 15 13.28571 112506 0.76 141198 50800078 126038.2 8.98 NA 1
2021-01-11 MMSD-P18 22 12.71429 295775 1.74 356586 15894033 324760.3 12.00 NA 1
2021-01-11 MMSD-P2 11 16.14286 216079 1.79 249714 69705020 232288.5 5.22 NA 1
2021-01-11 MMSD-P7 6 18.28571 122744 1.21 195988 39362943 155101.1 4.07 NA 1
2021-01-11 MMSD-P8 12 20.28571 152907 1.92 194029 28159113 172245.2 6.04 NA 1
2021-01-14 MMSD-P11 32 27.14286 147601 3.39 158432 187180326 152920.6 8.85 NA 1

The case data has a strong weekend effect so for this section we look at a seven day smoothing of cases. The simple display of the data shows the core components of this story. First, wastewater data is noisy. And that there is a clear relationship between the two signals.

FirstImpresionFunc <- function(DF){
  FirstImpressionDF <- DF%>%
    NoNa(SelectedIndVar,SelectedDepVar)#Removing NA
  FirstImpressionGGplot = FirstImpressionDF%>%
  ggplot(aes(x=Date))+#Data depends on time
  geom_point(aes(y=!!sym(SelectedDepVar), color=SelectedDepVar,info=!!sym(SelectedDepVar)),size = 1)+
  geom_line(aes(y=MinMaxFixing(!!sym(SelectedIndVar),!!sym(SelectedDepVar)), 
                color=SelectedIndVar,
                info=!!sym(SelectedIndVar)))+#compares SelectedIndVar to Cases
  geom_line(aes(y=(SevenDayMACases), 
                color="Seven Day MA Cases",
                info=SevenDayMACases))+
  labs(y="Reported cases")+
    facet_wrap(~Site)
  return(ggplotly(FirstImpressionGGplot))
}
FirstImpresionFunc(filter(FullDF,Site=="Madison"))
FirstImpresionFunc(filter(FullDF,Site=="MMSD-P2"))
FirstImpresionFunc(filter(FullDF,Site=="MMSD-P7"))
FirstImpresionFunc(filter(FullDF,Site=="MMSD-P8"))
FirstImpresionFunc(filter(FullDF,Site=="MMSD-P11"))
FirstImpresionFunc(filter(FullDF,Site=="MMSD-P18"))

2 Removing potential outliers

Looking at the wastewater measurements we observe there were some points many times larger than adjacent values hinting at them being outliers. We used the adjacent 10 values on each side and marked points 2.5 standard deviations away from the group mean as outliers.

OutlierRemoveFunc <- function(DF){
  ErrorMarkedDF <- DF%>%#
    mutate(FlagedOutliers = IdentifyOutliers(!!sym(SelectedIndVar),Bin = 21, Action = "Flag"),
           #Manual flagging that method misses due to boundary effect with binning
           FlagedOutliers = ifelse(Date == mdy("01/26/2021") | Date == mdy("01/27/2021"),
                                   TRUE, FlagedOutliers),
           NoOutlierVar = ifelse(FlagedOutliers, NA, !!sym(SelectedIndVar)))
  return(ErrorMarkedDF)
}
MadDF <- OutlierRemoveFunc(filter(FullDF,Site=="Madison"))
P2DF <- OutlierRemoveFunc(filter(FullDF,Site=="MMSD-P2"))
P7DF <- OutlierRemoveFunc(filter(FullDF,Site=="MMSD-P7"))
P8DF <- OutlierRemoveFunc(filter(FullDF,Site=="MMSD-P8"))
P11DF <- OutlierRemoveFunc(filter(FullDF,Site=="MMSD-P11"))
P18DF <- OutlierRemoveFunc(filter(FullDF,Site=="MMSD-P18"))

OutlierPlotFunc <- function(DF){
  #Split N1 into outlier and non outlier for next ggplot
OutLierPlotDF <- DF%>%
  mutate(!!OutlierName := ifelse(FlagedOutliers,!!sym(SelectedIndVar),NA))%>%
           mutate(!!SelectedIndVar := NoOutlierVar)

OutLierPlotObject <- OutLierPlotDF%>%
  filter(!(is.na(!!sym(SelectedIndVar))&is.na(!!sym(OutlierName))))%>%
  ggplot(aes(x=Date))+#Data depends on time 
  geom_line(aes(y=!!sym(SelectedIndVar),
                color=SelectedIndVar, 
                info = !!sym(SelectedIndVar)))+#compares Var to Cases
  geom_point(aes(y=!!sym(OutlierName),
                 color=OutlierName,
                 info = !!sym(OutlierName)))+
  ColorRule+
    facet_wrap(~Site)

#mentioned hand picked list other choices
ReturnPlot <- ggplotly(OutLierPlotObject,tooltip=c("info","Date"))%>%
    layout(yaxis = SecondAxisFormat,
           legend=list(title=list(text=''),x = 1.15, y = 0.9),
           margin = list(l = 50, r = 75, b = 50, t = 50, pad = 4))
return(ReturnPlot)
}
OutlierPlotFunc(MadDF)
OutlierPlotFunc(P2DF)
OutlierPlotFunc(P7DF)
OutlierPlotFunc(P8DF)
OutlierPlotFunc(P11DF)
OutlierPlotFunc(P18DF)
PrepOutlierFunc <- function(DF){
  #Drop Var create Var filter 
UpdatedDF <- DF%>%
  select(-sym(SelectedIndVar))%>%
  rename(!!sym(SelectedIndVar) := NoOutlierVar)
return(UpdatedDF)
}

#MadDF <- PrepOutlierFunc(MadDF)
#P2DF <- PrepOutlierFunc(P2DF)
#P7DF <- PrepOutlierFunc(P7DF)
#P8DF <- PrepOutlierFunc(P8DF)
#P11DF <- PrepOutlierFunc(P11DF)
#P18DF <- PrepOutlierFunc(P18DF)


#FullDF

DetectedMain <- MadDF%>%
  filter(FlagedOutliers)%>%
  select(Date,FlagedOutliers)

StevePlot <- FullDF%>%
  ggplot()+
  aes(x=Date)+
  geom_vline(aes(xintercept=Date),color="red",data=DetectedMain)+
  geom_point(aes(y=N1))+
  facet_wrap(~Site, scale = "free_y")

StevePlot

A <- MadDF%>%
  filter(FlagedOutliers)
B <- P2DF%>%
  filter(FlagedOutliers)
C <- P7DF%>%
  filter(FlagedOutliers)
D <- P8DF%>%
  filter(FlagedOutliers)
E <- P11DF%>%
  filter(FlagedOutliers)
fF <- P18DF%>%
  filter(FlagedOutliers)

AllErrors <- full_join(full_join(full_join(full_join(full_join(A,B),C),D),E),fF)

AllErrors
##          Date     Site FirstConfirmed SevenDayMACases       N1   BCoV       N2
## 1  2021-01-26  Madison             56       89.428571  1831813  6.930  1686654
## 2  2021-01-27  Madison            162       88.857143  1357329  2.059  1323010
## 3  2021-04-15  Madison             56       50.285714   571269  3.684   706418
## 4  2021-04-26  Madison             NA       39.333333   675748 16.079   546554
## 5  2021-05-02  Madison             26       37.333333   876165  4.710  1134521
## 6  2021-05-14  Madison             37       23.500000   324203  5.070   465104
## 7  2021-06-06  Madison             NA        8.000000   658991  9.570   647639
## 8  2021-06-08  Madison              8        8.000000  1459947  0.982  1464354
## 9  2021-07-26  Madison             24       43.333333   186460  3.589   143034
## 10 2021-07-28  Madison             62       48.428571   260334 27.638   207880
## 11 2021-10-17  Madison             58       56.428571   823463  1.686   693348
## 12 2021-11-26  Madison             55      104.600000  1705574  6.299  1758779
## 13 2021-12-13  Madison            144      229.333333  1498471 13.886  1515771
## 14 2022-01-05  Madison            452      346.285714  4630522 32.666  5712808
## 15 2022-01-23  Madison            420      569.714286 17355259  1.945 13331342
## 16 2021-01-26  MMSD-P2             22       16.428571       NA     NA       NA
## 17 2021-01-27  MMSD-P2             48       26.714286       NA     NA       NA
## 18 2021-01-26  MMSD-P7              3       18.428571       NA     NA       NA
## 19 2021-01-27  MMSD-P7             15       28.142857       NA     NA       NA
## 20 2021-01-26  MMSD-P8              7       23.428571       NA     NA       NA
## 21 2021-01-27  MMSD-P8             24       25.428571       NA     NA       NA
## 22 2021-01-26 MMSD-P11             13       10.000000       NA     NA       NA
## 23 2021-01-27 MMSD-P11             56       24.142857       NA     NA       NA
## 24 2021-01-26 MMSD-P18             10        9.142857       NA     NA       NA
## 25 2021-01-27 MMSD-P18             18       24.428571       NA     NA       NA
##       PMMoV GeoMeanN12 FlowRate temperature equiv_sewage_amt FlagedOutliers
## 1  29748189  1757735.7    37.74          NA            1.000           TRUE
## 2  30102158  1340059.6    36.17          NA            1.000           TRUE
## 3  23628196   635259.6    39.02          NA            1.000           TRUE
## 4  28619982   607727.5    36.04          NA            0.250           TRUE
## 5  42099107   997009.3    35.23          NA            0.250           TRUE
## 6  34292122   388314.5    36.12          NA            0.625           TRUE
## 7  32185831   653290.3    34.39          NA            1.000           TRUE
## 8  54673826  1462148.8    36.40          NA            1.000           TRUE
## 9  30230924   163309.9    34.93          NA            1.000           TRUE
## 10 37320637   232633.3    36.82          NA            1.000           TRUE
## 11 29427968   755610.0    35.38          NA            0.625           TRUE
## 12 26466528  1731972.2    32.48          NA            0.625           TRUE
## 13 24525524  1507096.2    35.78          NA            0.625           TRUE
## 14 15863845  5143275.5    34.46          NA            0.625           TRUE
## 15 25317297 15210815.0    34.13          NA            0.625           TRUE
## 16       NA         NA       NA          NA               NA           TRUE
## 17       NA         NA       NA          NA               NA           TRUE
## 18       NA         NA       NA          NA               NA           TRUE
## 19       NA         NA       NA          NA               NA           TRUE
## 20       NA         NA       NA          NA               NA           TRUE
## 21       NA         NA       NA          NA               NA           TRUE
## 22       NA         NA       NA          NA               NA           TRUE
## 23       NA         NA       NA          NA               NA           TRUE
## 24       NA         NA       NA          NA               NA           TRUE
## 25       NA         NA       NA          NA               NA           TRUE
##    NoOutlierVar
## 1            NA
## 2            NA
## 3            NA
## 4            NA
## 5            NA
## 6            NA
## 7            NA
## 8            NA
## 9            NA
## 10           NA
## 11           NA
## 12           NA
## 13           NA
## 14           NA
## 15           NA
## 16           NA
## 17           NA
## 18           NA
## 19           NA
## 20           NA
## 21           NA
## 22           NA
## 23           NA
## 24           NA
## 25           NA
#On a given day we have 5 intercepters

OutlierPlotFunc(MadDF)
OutlierPlotFunc(P2DF)
OutlierPlotFunc(P7DF)
OutlierPlotFunc(P8DF)
OutlierPlotFunc(P11DF)
OutlierPlotFunc(P18DF)
AllErrors%>%
  ggplot()+
  aes(x=Date)+
  geom_point(aes(y=N1,color=Site),size=2)+
  geom_vline(xintercept = ymd("2021-04-26"))+
  geom_vline(xintercept = ymd("2021-11-18"))+
  facet_wrap(~Site, scale = "free_y")

AllErrors%>%
  filter(!is.na(N1))%>%
  group_by(Date)%>%
  summarize(n())
## # A tibble: 15 x 2
##    Date       `n()`
##    <date>     <int>
##  1 2021-01-26     1
##  2 2021-01-27     1
##  3 2021-04-15     1
##  4 2021-04-26     1
##  5 2021-05-02     1
##  6 2021-05-14     1
##  7 2021-06-06     1
##  8 2021-06-08     1
##  9 2021-07-26     1
## 10 2021-07-28     1
## 11 2021-10-17     1
## 12 2021-11-26     1
## 13 2021-12-13     1
## 14 2022-01-05     1
## 15 2022-01-23     1
#"2021-01-26" 
#"2021-06-06"
#"2021-06-08"
#"2021-10-17"
#"2021-11-26"

"2021-04-26"
## [1] "2021-04-26"
"2021-11-18"
## [1] "2021-11-18"
#P2 error seen in madison: maybe?
#P7 error seen in madison: maybe?
#P8 error seen in madison: no
#P8 error seen in madison: no
#P18 error seen in madison: yes, sometimes
LoessPlotFunc <- function(DF,SpanConstant = .163){
  MainDF <- DF
  MainDF[[loessVar]] <- loessFit(y=(MainDF[[SelectedIndVar]]), 
                      x=MainDF$Date, #create loess fit of the data
                      span=SpanConstant, #span of .2 seems to give the best result, not rigorously chosen
                      iterations=2)$fitted#2 iterations remove some bad patterns
  
  MainDF <- MainDF%>%
    NoNa(loessVar,"SevenDayMACases")
  SLDLoessGraphic <- MainDF%>%
    ggplot(aes(x=Date))+
    geom_line(aes(y=!!sym(SelectedDepVar), color=SelectedDepVar , info = !!sym(SelectedDepVar)),alpha=.1)+
    geom_line(aes(y=MinMaxFixing(!!sym(SelectedIndVar),!!sym(SelectedDepVar)),
                color=SelectedIndVar,
                info = !!sym(SelectedIndVar)),
            alpha=.2)+
    geom_line(aes(y=SevenDayMACases,
                color="Seven Day MA Cases" , 
                info = SevenDayMACases))+
    geom_line(aes(y=MinMaxFixing(!!sym(loessVar),!!sym(SelectedDepVar),!!sym(SelectedIndVar)), 
                color=loessVar ,
                info = !!sym(loessVar)))+
    facet_wrap(~Site)+
    labs(y="Reported cases")


ReturnPlot <- ggplotly(SLDLoessGraphic,tooltip=c("info","Date"))%>%
    add_lines(x=~Date, y=MainDF[[SelectedIndVar]],
            yaxis="y2", data=MainDF, showlegend=FALSE, inherit=FALSE) %>%
    layout(yaxis2 = SecondAxisFormat,
           legend=list(title=list(text=''),x = 1.15, y = 0.9),
           margin = list(l = 50, r = 75, b = 50, t = 50, pad = 4))
return(ReturnPlot)
}


LoessPlotFunc(MadDF)
LoessPlotFunc(P2DF)
LoessPlotFunc(P7DF)
LoessPlotFunc(P8DF)
LoessPlotFunc(P11DF)
LoessPlotFunc(P18DF)
#MMSD-P18/P7
#2021-04-15